In [6]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os

from plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
In [7]:
project_data = pd.read_csv('train_data.csv')
resource_data = pd.read_csv('resources.csv')
In [8]:
print("Number of data points in train data", project_data.shape)
print('-'*50)
print("The attributes of data :", project_data.columns.values)
Number of data points in train data (109248, 17)
--------------------------------------------------
The attributes of data : ['Unnamed: 0' 'id' 'teacher_id' 'teacher_prefix' 'school_state'
 'project_submitted_datetime' 'project_grade_category'
 'project_subject_categories' 'project_subject_subcategories'
 'project_title' 'project_essay_1' 'project_essay_2' 'project_essay_3'
 'project_essay_4' 'project_resource_summary'
 'teacher_number_of_previously_posted_projects' 'project_is_approved']
In [9]:
print("Number of data points in train data", resource_data.shape)
print(resource_data.columns.values)
resource_data.head(2)
Number of data points in train data (1541272, 4)
['id' 'description' 'quantity' 'price']
Out[9]:
id description quantity price
0 p233245 LC652 - Lakeshore Double-Space Mobile Drying Rack 1 149.00
1 p069063 Bouncy Bands for Desks (Blue support pipes) 3 14.95
In [5]:
# https://matplotlib.org/gallery/pie_and_polar_charts/pie_and_donut_labels.html#sphx-glr-gallery-pie-and-polar-charts-pie-and-donut-labels-py

y_value_counts = project_data['project_is_approved'].value_counts()
print("Number of projects thar are approved for funding ", y_value_counts[1], ", (", (y_value_counts[1]/(y_value_counts[1]+y_value_counts[0]))*100,"%)")
print("Number of projects thar are not approved for funding ", y_value_counts[0], ", (", (y_value_counts[0]/(y_value_counts[1]+y_value_counts[0]))*100,"%)")

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(aspect="equal"))
recipe = ["Accepted", "Not Accepted"]

data = [y_value_counts[1], y_value_counts[0]]

wedges, texts = ax.pie(data, wedgeprops=dict(width=0.5), startangle=-40)

bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(xycoords='data', textcoords='data', arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

for i, p in enumerate(wedges):
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(recipe[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                 horizontalalignment=horizontalalignment, **kw)

ax.set_title("Nmber of projects that are Accepted and not accepted")

plt.show()
Number of projects thar are approved for funding  92706 , ( 84.85830404217927 %)
Number of projects thar are not approved for funding  16542 , ( 15.141695957820739 %)
In [6]:
temp = pd.DataFrame(project_data.groupby("school_state")["project_is_approved"].apply(np.mean)).reset_index()
# if you have data which contain only 0 and 1, then the mean = percentage (think about it)
temp.columns = ['state_code', 'num_proposals']

'''# How to plot US state heatmap: https://datascience.stackexchange.com/a/9620

scl = [[0.0, 'rgb(242,240,247)'],[0.2, 'rgb(218,218,235)'],[0.4, 'rgb(188,189,220)'],\
            [0.6, 'rgb(158,154,200)'],[0.8, 'rgb(117,107,177)'],[1.0, 'rgb(84,39,143)']]

data = [ dict(
        type='choropleth',
        colorscale = scl,
        autocolorscale = False,
        locations = temp['state_code'],
        z = temp['num_proposals'].astype(float),
        locationmode = 'USA-states',
        text = temp['state_code'],
        marker = dict(line = dict (color = 'rgb(255,255,255)',width = 2)),
        colorbar = dict(title = "% of pro")
    ) ]

layout = dict(
        title = 'Project Proposals % of Acceptance Rate by US States',
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showlakes = True,
            lakecolor = 'rgb(255, 255, 255)',
        ),
    )

fig = go.Figure(data=data, layout=layout)
offline.iplot(fig, filename='us-map-heat-map')
'''
Out[6]:
'# How to plot US state heatmap: https://datascience.stackexchange.com/a/9620\n\nscl = [[0.0, \'rgb(242,240,247)\'],[0.2, \'rgb(218,218,235)\'],[0.4, \'rgb(188,189,220)\'],            [0.6, \'rgb(158,154,200)\'],[0.8, \'rgb(117,107,177)\'],[1.0, \'rgb(84,39,143)\']]\n\ndata = [ dict(\n        type=\'choropleth\',\n        colorscale = scl,\n        autocolorscale = False,\n        locations = temp[\'state_code\'],\n        z = temp[\'num_proposals\'].astype(float),\n        locationmode = \'USA-states\',\n        text = temp[\'state_code\'],\n        marker = dict(line = dict (color = \'rgb(255,255,255)\',width = 2)),\n        colorbar = dict(title = "% of pro")\n    ) ]\n\nlayout = dict(\n        title = \'Project Proposals % of Acceptance Rate by US States\',\n        geo = dict(\n            scope=\'usa\',\n            projection=dict( type=\'albers usa\' ),\n            showlakes = True,\n            lakecolor = \'rgb(255, 255, 255)\',\n        ),\n    )\n\nfig = go.Figure(data=data, layout=layout)\noffline.iplot(fig, filename=\'us-map-heat-map\')\n'
In [7]:
temp.sort_values(by=['num_proposals'], inplace=True)
print("States with lowest % approvals")
print(temp.head(5))
print('='*50)
print("States with highest % approvals")
print(temp.tail(5))
States with lowest % approvals
   state_code  num_proposals
46         VT       0.800000
7          DC       0.802326
43         TX       0.813142
26         MT       0.816327
18         LA       0.831245
==================================================
States with highest % approvals
   state_code  num_proposals
30         NH       0.873563
35         OH       0.875152
47         WA       0.876178
28         ND       0.888112
8          DE       0.897959
In [8]:
def stack_plot(data, xtick, col2='project_is_approved', col3='total'):
    ind = np.arange(data.shape[0])
    
    plt.figure(figsize=(20,5))
    p1 = plt.bar(ind, data[col3].values)
    p2 = plt.bar(ind, data[col2].values)

    plt.ylabel('Projects')
    plt.title('Number of projects aproved vs rejected')
    plt.xticks(ind, list(data[xtick].values))
    plt.legend((p1[0], p2[0]), ('total', 'accepted'))
    plt.show()
In [9]:
def univariate_barplots(data, col1, col2='project_is_approved', top=False):
    # Count number of zeros in dataframe python: https://stackoverflow.com/a/51540521/4084039
    temp = pd.DataFrame(project_data.groupby(col1)[col2].agg(lambda x: x.eq(1).sum())).reset_index()

    # Pandas dataframe grouby count: https://stackoverflow.com/a/19385591/4084039
    temp['total'] = pd.DataFrame(project_data.groupby(col1)[col2].agg({'total':'count'})).reset_index()['total']
    temp['Avg'] = pd.DataFrame(project_data.groupby(col1)[col2].agg({'Avg':'mean'})).reset_index()['Avg']
    
    temp.sort_values(by=['total'],inplace=True, ascending=False)
    
    if top:
        temp = temp[0:top]
    
    stack_plot(temp, xtick=col1, col2=col2, col3='total')
    print(temp.head(10))
    print("="*50)
    print(temp.tail(10))
In [10]:
univariate_barplots(project_data, 'school_state', 'project_is_approved', False)
   school_state  project_is_approved  total       Avg
4            CA                13205  15388  0.858136
43           TX                 6014   7396  0.813142
34           NY                 6291   7318  0.859661
9            FL                 5144   6185  0.831690
27           NC                 4353   5091  0.855038
14           IL                 3710   4350  0.852874
10           GA                 3329   3963  0.840020
40           SC                 3385   3936  0.860010
22           MI                 2672   3161  0.845302
38           PA                 2658   3109  0.854937
==================================================
   school_state  project_is_approved  total       Avg
30           NH                  304    348  0.873563
0            AK                  290    345  0.840580
8            DE                  308    343  0.897959
29           NE                  260    309  0.841424
41           SD                  252    300  0.840000
39           RI                  243    285  0.852632
26           MT                  200    245  0.816327
28           ND                  127    143  0.888112
50           WY                   82     98  0.836735
46           VT                   64     80  0.800000
In [11]:
univariate_barplots(project_data, 'teacher_prefix', 'project_is_approved' , top=False)
  teacher_prefix  project_is_approved  total       Avg
2           Mrs.                48997  57269  0.855559
3            Ms.                32860  38955  0.843537
1            Mr.                 8960  10648  0.841473
4        Teacher                 1877   2360  0.795339
0            Dr.                    9     13  0.692308
==================================================
  teacher_prefix  project_is_approved  total       Avg
2           Mrs.                48997  57269  0.855559
3            Ms.                32860  38955  0.843537
1            Mr.                 8960  10648  0.841473
4        Teacher                 1877   2360  0.795339
0            Dr.                    9     13  0.692308
In [12]:
univariate_barplots(project_data, 'project_grade_category', 'project_is_approved', top=False)
  project_grade_category  project_is_approved  total       Avg
3          Grades PreK-2                37536  44225  0.848751
0             Grades 3-5                31729  37137  0.854377
1             Grades 6-8                14258  16923  0.842522
2            Grades 9-12                 9183  10963  0.837636
==================================================
  project_grade_category  project_is_approved  total       Avg
3          Grades PreK-2                37536  44225  0.848751
0             Grades 3-5                31729  37137  0.854377
1             Grades 6-8                14258  16923  0.842522
2            Grades 9-12                 9183  10963  0.837636
In [10]:
catogories = list(project_data['project_subject_categories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
cat_list = []
for i in catogories:
    temp = ""
    # consider we have text like this "Math & Science, Warmth, Care & Hunger"
    for j in i.split(','): # it will split it in three parts ["Math & Science", "Warmth", "Care & Hunger"]
        if 'The' in j.split(): # this will split each of the catogory based on space "Math & Science"=> "Math","&", "Science"
            j=j.replace('The','') # if we have the words "The" we are going to replace it with ''(i.e removing 'The')
        j = j.replace(' ','') # we are placeing all the ' '(space) with ''(empty) ex:"Math & Science"=>"Math&Science"
        temp+=j.strip()+" " #" abc ".strip() will return "abc", remove the trailing spaces
        temp = temp.replace('&','_') # we are replacing the & value into 
    cat_list.append(temp.strip())
In [11]:
project_data['clean_categories'] = cat_list
project_data.drop(['project_subject_categories'], axis=1, inplace=True)
project_data.head(2)
Out[11]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_grade_category project_subject_subcategories project_title project_essay_1 project_essay_2 project_essay_3 project_essay_4 project_resource_summary teacher_number_of_previously_posted_projects project_is_approved clean_categories
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Grades PreK-2 ESL, Literacy Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... NaN NaN My students need opportunities to practice beg... 0 0 Literacy_Language
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Grades 6-8 Civics & Government, Team Sports Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... NaN NaN My students need a projector to help with view... 7 1 History_Civics Health_Sports
In [15]:
univariate_barplots(project_data, 'clean_categories', 'project_is_approved', top=20)
                     clean_categories  project_is_approved  total       Avg
24                  Literacy_Language                20520  23655  0.867470
32                       Math_Science                13991  17072  0.819529
28     Literacy_Language Math_Science                12725  14636  0.869432
8                       Health_Sports                 8640  10177  0.848973
40                         Music_Arts                 4429   5180  0.855019
46                       SpecialNeeds                 3431   4226  0.811879
30     Literacy_Language SpecialNeeds                 3389   3961  0.855592
0                     AppliedLearning                 3072   3771  0.814638
36     Math_Science Literacy_Language                 1968   2289  0.859764
3   AppliedLearning Literacy_Language                 1887   2191  0.861251
==================================================
                    clean_categories  project_is_approved  total       Avg
16                    History_Civics                 1545   1851  0.834684
38         Math_Science SpecialNeeds                 1531   1840  0.832065
29      Literacy_Language Music_Arts                 1475   1757  0.839499
37           Math_Science Music_Arts                 1366   1642  0.831912
6       AppliedLearning SpecialNeeds                 1195   1467  0.814588
19  History_Civics Literacy_Language                 1271   1421  0.894441
14        Health_Sports SpecialNeeds                 1215   1391  0.873472
50                Warmth Care_Hunger                 1212   1309  0.925898
33      Math_Science AppliedLearning                 1019   1220  0.835246
4       AppliedLearning Math_Science                  855   1052  0.812738
In [16]:
from collections import Counter
my_counter = Counter()
for word in project_data['clean_categories'].values:
    my_counter.update(word.split())
In [17]:
cat_dict = dict(my_counter)
sorted_cat_dict = dict(sorted(cat_dict.items(), key=lambda kv: kv[1]))


ind = np.arange(len(sorted_cat_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(sorted_cat_dict.values()))

plt.ylabel('Projects')
plt.title('% of projects aproved category wise')
plt.xticks(ind, list(sorted_cat_dict.keys()))
plt.show()
In [18]:
for i, j in sorted_cat_dict.items():
    print("{:20} :{:10}".format(i,j))
Warmth               :      1388
Care_Hunger          :      1388
History_Civics       :      5914
Music_Arts           :     10293
AppliedLearning      :     12135
SpecialNeeds         :     13642
Health_Sports        :     14223
Math_Science         :     41421
Literacy_Language    :     52239
In [19]:
sub_catogories = list(project_data['project_subject_subcategories'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python

sub_cat_list = []
for i in sub_catogories:
    temp = ""
    # consider we have text like this "Math & Science, Warmth, Care & Hunger"
    for j in i.split(','): # it will split it in three parts ["Math & Science", "Warmth", "Care & Hunger"]
        if 'The' in j.split(): # this will split each of the catogory based on space "Math & Science"=> "Math","&", "Science"
            j=j.replace('The','') # if we have the words "The" we are going to replace it with ''(i.e removing 'The')
        j = j.replace(' ','') # we are placeing all the ' '(space) with ''(empty) ex:"Math & Science"=>"Math&Science"
        temp +=j.strip()+" "#" abc ".strip() will return "abc", remove the trailing spaces
        temp = temp.replace('&','_')
    sub_cat_list.append(temp.strip())
In [20]:
project_data['clean_subcategories'] = sub_cat_list
project_data.drop(['project_subject_subcategories'], axis=1, inplace=True)
project_data.head(2)
Out[20]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_grade_category project_title project_essay_1 project_essay_2 project_essay_3 project_essay_4 project_resource_summary teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Grades PreK-2 Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... NaN NaN My students need opportunities to practice beg... 0 0 Literacy_Language ESL Literacy
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Grades 6-8 Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... NaN NaN My students need a projector to help with view... 7 1 History_Civics Health_Sports Civics_Government TeamSports
In [21]:
univariate_barplots(project_data, 'clean_subcategories', 'project_is_approved', top=50)
                clean_subcategories  project_is_approved  total       Avg
317                        Literacy                 8371   9486  0.882458
319            Literacy Mathematics                 7260   8325  0.872072
331  Literature_Writing Mathematics                 5140   5923  0.867803
318     Literacy Literature_Writing                 4823   5571  0.865733
342                     Mathematics                 4385   5379  0.815207
330              Literature_Writing                 3846   4501  0.854477
392                    SpecialNeeds                 3431   4226  0.811879
289                 Health_Wellness                 3131   3583  0.873849
18      AppliedSciences Mathematics                 2824   3399  0.830833
0                   AppliedSciences                 2038   2492  0.817817
==================================================
                    clean_subcategories  project_is_approved  total       Avg
304                   History_Geography                  442    540  0.818519
278      Health_LifeScience Mathematics                  434    537  0.808194
305          History_Geography Literacy                  490    533  0.919325
351              Mathematics VisualArts                  403    489  0.824131
291            Health_Wellness Literacy                  396    465  0.851613
196       EnvironmentalScience Literacy                  389    444  0.876126
127                                 ESL                  349    421  0.828979
79                   College_CareerPrep                  343    421  0.814727
17   AppliedSciences Literature_Writing                  361    420  0.859524
3    AppliedSciences College_CareerPrep                  330    405  0.814815
In [22]:
from collections import Counter
my_counter = Counter()
for word in project_data['clean_subcategories'].values:
    my_counter.update(word.split())
In [23]:
sub_cat_dict = dict(my_counter)
sorted_sub_cat_dict = dict(sorted(sub_cat_dict.items(), key=lambda kv: kv[1]))


ind = np.arange(len(sorted_sub_cat_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(sorted_sub_cat_dict.values()))

plt.ylabel('Projects')
plt.title('% of projects aproved state wise')
plt.xticks(ind, list(sorted_sub_cat_dict.keys()))
plt.show()
In [24]:
for i, j in sorted_sub_cat_dict.items():
    print("{:20} :{:10}".format(i,j))
Economics            :       269
CommunityService     :       441
FinancialLiteracy    :       568
ParentInvolvement    :       677
Extracurricular      :       810
Civics_Government    :       815
ForeignLanguages     :       890
NutritionEducation   :      1355
Warmth               :      1388
Care_Hunger          :      1388
SocialSciences       :      1920
PerformingArts       :      1961
CharacterEducation   :      2065
TeamSports           :      2192
Other                :      2372
College_CareerPrep   :      2568
Music                :      3145
History_Geography    :      3171
Health_LifeScience   :      4235
EarlyDevelopment     :      4254
ESL                  :      4367
Gym_Fitness          :      4509
EnvironmentalScience :      5591
VisualArts           :      6278
Health_Wellness      :     10234
AppliedSciences      :     10816
SpecialNeeds         :     13642
Literature_Writing   :     22179
Mathematics          :     28074
Literacy             :     33700
In [25]:
word_count = project_data['project_title'].str.split().apply(len).value_counts()
word_dict = dict(word_count)
word_dict = dict(sorted(word_dict.items(), key=lambda kv: kv[1]))


ind = np.arange(len(word_dict))
plt.figure(figsize=(20,5))
p1 = plt.bar(ind, list(word_dict.values()))

plt.ylabel('Numeber of projects')
plt.xlabel('Numeber words in project title')
plt.title('Words for each title of the project')
plt.xticks(ind, list(word_dict.keys()))
plt.show()
In [26]:
approved_title_word_count = project_data[project_data['project_is_approved']==1]['project_title'].str.split().apply(len)
approved_title_word_count = approved_title_word_count.values

rejected_title_word_count = project_data[project_data['project_is_approved']==0]['project_title'].str.split().apply(len)
rejected_title_word_count = rejected_title_word_count.values
In [27]:
plt.boxplot([approved_title_word_count, rejected_title_word_count])
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Words in project title')
plt.grid()
plt.show()
In [28]:
plt.figure(figsize=(10,3))
sns.kdeplot(approved_title_word_count,label="Approved Projects", bw=0.6)
sns.kdeplot(rejected_title_word_count,label="Not Approved Projects", bw=0.6)
plt.legend()
plt.show()
In [12]:
project_data["essay"] = project_data["project_essay_1"].map(str) +\
                        project_data["project_essay_2"].map(str) + \
                        project_data["project_essay_3"].map(str) + \
                        project_data["project_essay_4"].map(str)
In [13]:
approved_word_count = project_data[project_data['project_is_approved']==1]['essay'].str.split().apply(len)
approved_word_count = approved_word_count.values

rejected_word_count = project_data[project_data['project_is_approved']==0]['essay'].str.split().apply(len)
rejected_word_count = rejected_word_count.values
In [31]:
plt.boxplot([approved_word_count, rejected_word_count])
plt.title('Words for each essay of the project')
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Words in project essays')
plt.grid()
plt.show()
In [32]:
plt.figure(figsize=(10,3))
sns.distplot(approved_word_count, hist=False, label="Approved Projects")
sns.distplot(rejected_word_count, hist=False, label="Not Approved Projects")
plt.title('Words for each essay of the project')
plt.xlabel('Number of words in each eassay')
plt.legend()
plt.show()
In [33]:
resource_data.head(2)
Out[33]:
id description quantity price
0 p233245 LC652 - Lakeshore Double-Space Mobile Drying Rack 1 149.00
1 p069063 Bouncy Bands for Desks (Blue support pipes) 3 14.95
In [15]:
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)
Out[15]:
id price quantity
0 p000001 459.56 7
1 p000002 515.89 21
In [16]:
project_data = pd.merge(project_data, price_data, on='id', how='left')
In [17]:
approved_price = project_data[project_data['project_is_approved']==1]['price'].values

rejected_price = project_data[project_data['project_is_approved']==0]['price'].values
In [37]:
plt.boxplot([approved_price, rejected_price])
plt.title('Box Plots of Cost per approved and not approved Projects')
plt.xticks([1,2],('Approved Projects','Rejected Projects'))
plt.ylabel('Price')
plt.grid()
plt.show()
In [40]:
plt.figure(figsize=(10,3))
sns.distplot(approved_price, hist=False, label="Approved Projects")
sns.distplot(rejected_price, hist=False, label="Not Approved Projects")
plt.title('Cost per approved and not approved Projects')
plt.xlabel('Cost of a project')
plt.legend()
plt.show()
In [18]:
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Percentile", "Approved Projects", "Not Approved Projects"]
for i in range(0,101,5):
    x.add_row([i,np.round(np.percentile(approved_price,i), 3), np.round(np.percentile(rejected_price,i), 3)])
print(x)
+------------+-------------------+-----------------------+
| Percentile | Approved Projects | Not Approved Projects |
+------------+-------------------+-----------------------+
|     0      |        0.66       |          1.97         |
|     5      |       13.59       |          41.9         |
|     10     |       33.88       |         73.67         |
|     15     |        58.0       |         99.109        |
|     20     |       77.38       |         118.56        |
|     25     |       99.95       |        140.892        |
|     30     |       116.68      |         162.23        |
|     35     |      137.232      |        184.014        |
|     40     |       157.0       |        208.632        |
|     45     |      178.265      |        235.106        |
|     50     |       198.99      |        263.145        |
|     55     |       223.99      |         292.61        |
|     60     |       255.63      |        325.144        |
|     65     |      285.412      |         362.39        |
|     70     |      321.225      |         399.99        |
|     75     |      366.075      |        449.945        |
|     80     |       411.67      |        519.282        |
|     85     |       479.0       |        618.276        |
|     90     |       593.11      |        739.356        |
|     95     |      801.598      |        992.486        |
|    100     |       9999.0      |         9999.0        |
+------------+-------------------+-----------------------+
In [42]:
resource_data.head(2)
Out[42]:
id description quantity price
0 p233245 LC652 - Lakeshore Double-Space Mobile Drying Rack 1 149.00
1 p069063 Bouncy Bands for Desks (Blue support pipes) 3 14.95
In [43]:
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)
Out[43]:
id price quantity
0 p000001 459.56 7
1 p000002 515.89 21

1.2.9 UNIVARIATE FOR TEACHER NUMBER OF PREVIOUSLY POSTED PROJECT

In [44]:
univariate_barplots(project_data, 'teacher_number_of_previously_posted_projects', 'project_is_approved', top=25)
   teacher_number_of_previously_posted_projects  project_is_approved  total  \
0                                             0                24652  30014   
1                                             1                13329  16058   
2                                             2                 8705  10350   
3                                             3                 5997   7110   
4                                             4                 4452   5266   
5                                             5                 3536   4171   
6                                             6                 2911   3404   
7                                             7                 2298   2691   
8                                             8                 1977   2293   
9                                             9                 1700   1959   

        Avg  
0  0.821350  
1  0.830054  
2  0.841063  
3  0.843460  
4  0.845423  
5  0.847758  
6  0.855170  
7  0.853958  
8  0.862189  
9  0.867790  
==================================================
    teacher_number_of_previously_posted_projects  project_is_approved  total  \
15                                            15                  818    942   
16                                            16                  769    894   
17                                            17                  712    803   
18                                            18                  666    772   
19                                            19                  632    710   
20                                            20                  578    661   
21                                            21                  519    584   
22                                            22                  495    548   
23                                            23                  479    536   
25                                            25                  456    509   

         Avg  
15  0.868365  
16  0.860179  
17  0.886675  
18  0.862694  
19  0.890141  
20  0.874433  
21  0.888699  
22  0.903285  
23  0.893657  
25  0.895874  

summary:

By this univariate anlaysis we can observe that the teacher number of previously posted projects list in top 10 and teachers

who does not propose any prior projects are also accepted.nearly 82% of the projects are approved by the teachers who did not propose any projects before, we also can see that the rate of proposal is higher if the proposed projects are atleast 20.

In [45]:
summary = []

for c in project_data["project_resource_summary"] :
    summary.append(c)
    
summary[0:10]
Out[45]:
['My students need opportunities to practice beginning reading skills in English at home.',
 'My students need a projector to help with viewing educational programs',
 'My students need shine guards, athletic socks, Soccer Balls, goalie gloves, and training materials for the upcoming Soccer season.',
 'My students need to engage in Reading and Math in a way that will inspire them with these Mini iPads!',
 'My students need hands on practice in mathematics. Having fun and personalized journals and charts will help them be more involved in our daily Math routines.',
 'My students need movement to be successful. Being that I have a variety of students that have all different types of needs, flexible seating would assist not only these students with special needs, but all students.',
 'My students need some dependable laptops for daily classroom use for reading and math.',
 'My students need ipads to help them access a world of online resources that will spark their interest in learning.',
 "My students need three devices and three management licenses for small group's easy access to newly-implemented online programs--Go Noodle Plus, for increased in-class physical activity and Light Sail, an interactive reading program.",
 'My students need great books to use during Independent Reading, Read Alouds, Partner Reading and Author Studies.']
In [46]:
len(summary)
Out[46]:
109248
In [47]:
numeric_summary_values = {}

for x in tqdm(range(len(summary))):
    for s in summary[x].split():
        if s.isdigit() :
            numeric_summary_values[x] = int(s)
100%|██████████████████████████████████████████████████████████████████████| 109248/109248 [00:00<00:00, 111434.80it/s]
In [177]:
numeric_summary_values[14]
Out[177]:
5
In [48]:
numeric_digits = {}

for c in range(len(summary)) :
    if c in numeric_summary_values.keys() :
        numeric_digits[c] = numeric_summary_values[c]
    else :
        numeric_digits[c] = 0
In [49]:
digit_in_summary = []

for a in numeric_digits.values() :
    if a > 0 :
        digit_in_summary.append(1)
    else :
        digit_in_summary.append(0)
In [50]:
digit_in_summary[0:14]
Out[50]:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
In [51]:
project_data['digit_in_summary'] = digit_in_summary
In [52]:
project_data.head(30)
Out[52]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_grade_category project_title project_essay_1 project_essay_2 ... project_essay_4 project_resource_summary teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price quantity digit_in_summary
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Grades PreK-2 Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... ... NaN My students need opportunities to practice beg... 0 0 Literacy_Language ESL Literacy My students are English learners that are work... 154.60 23 0
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Grades 6-8 Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... ... NaN My students need a projector to help with view... 7 1 History_Civics Health_Sports Civics_Government TeamSports Our students arrive to our school eager to lea... 299.00 1 0
2 21895 p182444 3465aaf82da834c0582ebd0ef8040ca0 Ms. AZ 2016-08-31 12:03:56 Grades 6-8 Soccer Equipment for AWESOME Middle School Stu... \r\n\"True champions aren't always the ones th... The students on the campus come to school know... ... NaN My students need shine guards, athletic socks,... 1 0 Health_Sports Health_Wellness TeamSports \r\n\"True champions aren't always the ones th... 516.85 22 0
3 45 p246581 f3cb9bffbba169bef1a77b243e620b60 Mrs. KY 2016-10-06 21:16:17 Grades PreK-2 Techie Kindergarteners I work at a unique school filled with both ESL... My students live in high poverty conditions wi... ... NaN My students need to engage in Reading and Math... 4 1 Literacy_Language Math_Science Literacy Mathematics I work at a unique school filled with both ESL... 232.90 4 0
4 172407 p104768 be1f7507a41f8479dc06f047086a39ec Mrs. TX 2016-07-11 01:10:09 Grades PreK-2 Interactive Math Tools Our second grade classroom next year will be m... For many students, math is a subject that does... ... NaN My students need hands on practice in mathemat... 1 1 Math_Science Mathematics Our second grade classroom next year will be m... 67.98 4 0
5 141660 p154343 a50a390e8327a95b77b9e495b58b9a6e Mrs. FL 2017-04-08 22:40:43 Grades 3-5 Flexible Seating for Mrs. Jarvis' Terrific Thi... I will be moving from 2nd grade to 3rd grade a... These flexible seating options will allow my s... ... NaN My students need movement to be successful. Be... 1 1 Literacy_Language SpecialNeeds Literature_Writing SpecialNeeds I will be moving from 2nd grade to 3rd grade a... 113.22 11 0
6 21147 p099819 9b40170bfa65e399981717ee8731efc3 Mrs. CT 2017-02-17 19:58:56 Grades 6-8 Chromebooks for Special Education Reading Program My students are a dynamic and very energetic g... My students are an engaging and active group o... ... NaN My students need some dependable laptops for d... 1 1 Literacy_Language SpecialNeeds Literacy SpecialNeeds My students are a dynamic and very energetic g... 159.99 3 0
7 94142 p092424 5bfd3d12fae3d2fe88684bbac570c9d2 Ms. GA 2016-09-01 00:02:15 Grades 3-5 It's the 21st Century Not only do our students struggle with poverty... My students need 4 iPads, the latest technolog... ... NaN My students need ipads to help them access a w... 7 1 Math_Science Mathematics Not only do our students struggle with poverty... 229.00 4 0
8 112489 p045029 487448f5226005d08d36bdd75f095b31 Mrs. SC 2016-09-25 17:00:26 Grades PreK-2 Targeting More Success in Class My students are enthusiastic and inquisitive l... My second graders need extra activity time dur... ... NaN My students need three devices and three manag... 28 1 Health_Sports Health_Wellness My students are enthusiastic and inquisitive l... 241.98 6 0
9 158561 p001713 140eeac1885c820ad5592a409a3a8994 Ms. NC 2016-11-17 18:18:56 Grades PreK-2 Just For the Love of Reading--\r\nPure Pleasure Over 95% of my students are on free or reduced... Reading is Fundamental! My students will read ... ... NaN My students need great books to use during Ind... 36 1 Literacy_Language Literacy Literature_Writing Over 95% of my students are on free or reduced... 125.36 14 0
10 43184 p040307 363788b51d40d978fe276bcb1f8a2b35 Mrs. CA 2017-01-04 16:40:30 Grades 3-5 Reading Changes Lives \"There are many little ways to enlarge your w... I've had 8 sets of students enjoy the books in... ... NaN My students need books by their favorite autho... 37 1 Literacy_Language Literacy \"There are many little ways to enlarge your w... 100.21 10 0
11 127083 p251806 4ba7c721133ef651ca54a03551746708 Ms. CA 2016-11-14 22:57:28 Grades PreK-2 Elevating Academics and Parent Rapports Throug... All of our students receive free breakfast, lu... With three chromebooks, I can teach the Common... ... NaN My students need paper, three chromebooks, and... 32 1 Literacy_Language AppliedLearning Literacy ParentInvolvement All of our students receive free breakfast, lu... 431.77 8 0
12 19090 p051126 5e52c92b7e3c472aad247a239d345543 Mrs. NY 2016-05-23 15:46:02 Grades 6-8 Building Life Science Experiences My students are always working on new projects... My Spanish Dual Language students are always r... ... NaN My students need 3D and 4D life science activi... 5 0 Math_Science EnvironmentalScience Health_LifeScience My students are always working on new projects... 219.46 22 0
13 15126 p003874 178f6ae765cd4e0fb143a77c47fd65e2 Mrs. OK 2016-10-17 09:49:27 Grades PreK-2 Everyone deserves to be heard! I teach in a small school district in central ... My students are smart, creative, and also have... ... NaN My students need access to technology that wil... 30 1 SpecialNeeds SpecialNeeds I teach in a small school district in central ... 399.99 1 0
14 62232 p233127 424819801de22a60bba7d0f4354d0258 Ms. MA 2017-02-14 16:29:10 Grades PreK-2 TABLETS CAN SHOW US THE WORLD My students are my babies...I want the world f... Having this computer in the classroom would pr... ... NaN My students need 5 tablets for our classroom t... 15 0 Literacy_Language Literacy My students are my babies...I want the world f... 91.94 10 1
15 67303 p132832 bb6d6d054824fa01576ab38dfa2be160 Ms. TX 2016-10-05 21:05:38 Grades 3-5 Making Recess Active Located in West Dallas, my students face sever... Due to the size of our school, and the tiny na... ... NaN My students need activities to play during rec... 3 1 Health_Sports Health_Wellness Located in West Dallas, my students face sever... 435.84 24 0
16 127215 p174627 4ad7e280fddff889e1355cc9f29c3b89 Mrs. FL 2017-01-18 10:59:05 Grades PreK-2 Making Great LEAP's With Leapfrog! My Preschool children, ages 3-5 years old with... Having a set of Leapfrog iPads and educational... ... NaN My students need 2 LeapPad that will engage th... 1 1 Literacy_Language SpecialNeeds Literacy SpecialNeeds My Preschool children, ages 3-5 years old with... 298.43 7 1
17 157771 p152491 e39abda057354c979c5b075cffbe5f88 Ms. NV 2016-11-23 17:14:17 Grades 3-5 Technology Teaches Tomorrow's Talents Today My students are special because they come from... Classroom ChromebookCar\r\n\r\nMy name is Shan... ... NaN My students need Chromebooks to publish writte... 0 1 Math_Science Literacy_Language AppliedSciences Literature_Writing My students are special because they come from... 158.63 12 0
18 122186 p196421 fcd9b003fc1891383f340a89da02a1a6 Mrs. GA 2016-08-28 15:04:42 Grades PreK-2 Test Time I teach at a Title I school in a low-income ar... My 2nd grade students will benefit from having... ... NaN My students need privacy partitions to use whi... 0 1 AppliedLearning EarlyDevelopment I teach at a Title I school in a low-income ar... 59.98 4 0
19 146331 p058343 8e07a98deb1bc74c75b97521e05b1691 Ms. OH 2016-08-06 13:05:20 Grades 3-5 Wiggling Our Way to Success We are apart of an urban district and many of ... Many of my students struggle to sit still for ... ... NaN My students need 7 Hokki stools to encourage a... 9 1 Health_Sports Health_Wellness We are apart of an urban district and many of ... 749.42 7 1
20 75560 p052326 e0c1aad1f71badeff703fadc15f57680 Mrs. PA 2016-10-07 18:27:02 Grades PreK-2 Magic Carpet Ride in Our Library The students in our school come from diverse b... Each week our students love visiting the schoo... ... NaN My students need carpet in our library to brig... 23 1 Literacy_Language Literacy The students in our school come from diverse b... 213.85 1 0
21 132078 p187097 2d4a4d2d774e5c2fdd25b2ba0e7341f8 Mrs. NC 2016-05-17 19:45:13 Grades 6-8 From Sitting to Standing in the Classroom My students walk into school every day full of... I want to purchase desks in my classroom that ... ... NaN My students need desks to stand at and be able... 0 1 Math_Science SpecialNeeds Health_LifeScience SpecialNeeds My students walk into school every day full of... 250.91 4 0
22 84810 p165540 30f08fbe02eba5453c4ce2e857e88eb4 Ms. CA 2016-09-01 10:09:15 Grades 9-12 Books for Budding Intellectuals Every day in my English classroom, we work to ... My students need books that interest them so t... ... NaN My students need books so that they can become... 0 0 Literacy_Language Literacy Every day in my English classroom, we work to ... 278.09 21 0
23 8636 p219330 258ef2e6ab5ce007ac6764ce15d261ba Mr. AL 2017-01-10 11:41:06 Grades 6-8 Instrumental Power: Conquering STEAM! 100% of our musical students eat free breakfas... We need classroom instruments for our band pro... ... NaN My students need these instruments to give the... 2 1 Music_Arts Music 100% of our musical students eat free breakfas... 299.98 2 0
24 21478 p126524 74f8690562c44fc88f65f845b9fe61d0 Mrs. FL 2017-03-31 12:34:44 Grades PreK-2 S.T.E.A.M. Challenges(Science Technology Engin... This year, I am teaching in an EFL (Extended F... I will use these items to create S.T.E.A.M. bi... ... NaN My students need building materials, such as g... 0 1 Math_Science AppliedSciences Mathematics This year, I am teaching in an EFL (Extended F... 250.00 6 0
25 20142 p009037 b8bf3507cee960d5fedcb27719df2d59 Mrs. AL 2017-03-09 15:36:20 Grades 3-5 Math Masters! My students are highly motivated to succeed. U... These math games will help reinforce the skill... ... NaN My students need the learning centers and mult... 11 0 Math_Science Mathematics My students are highly motivated to succeed. U... 268.99 2 0
26 33903 p040091 7a0a5de5ed94e7036946b1ac3eaa99d0 Ms. TX 2016-09-18 22:10:40 Grades PreK-2 Techy Teaching I teach 22 bright 5 and 6 year olds. My studen... The iPads will be effectively used to improve ... ... NaN My students need 2 ipad minis to enhance learn... 2 1 Literacy_Language Math_Science Literacy Mathematics I teach 22 bright 5 and 6 year olds. My studen... 280.83 4 1
27 1156 p161033 efdc3cf14d136473c9f62becc00d4cec Teacher LA 2016-11-06 16:02:31 Grades 3-5 4th Grade French Immersion Class Ipads My students spend most of their day learning f... The iPads will also be used to enhance the stu... ... NaN My students need Ipads to work in smaller grou... 2 1 Literacy_Language Math_Science ForeignLanguages Mathematics My students spend most of their day learning f... 660.84 7 0
28 35430 p085706 22c8184c4660f1c589bea061d14b7f35 Mrs. GA 2017-01-27 12:34:59 Grades 9-12 Hands-On Language and Literacy My students all have a primary diagnosis of au... Children with autism struggle in core deficit ... ... NaN My students need to increase language and lite... 5 0 Literacy_Language SpecialNeeds Literacy SpecialNeeds My students all have a primary diagnosis of au... 129.98 3 0
29 22088 p032018 45f16a103f1e00b7439861d4e0728a59 Mrs. VA 2016-07-15 12:58:40 Grades PreK-2 Basic Classroom Supplies Needed I have an awesome group of 24 students any tea... My students need basic school supplies such as... ... NaN My students need basic school supplies such as... 0 1 Literacy_Language AppliedLearning Literacy Other I have an awesome group of 24 students any tea... 86.74 53 0

30 rows × 21 columns

In [53]:
univariate_barplots(project_data, 'digit_in_summary', 'project_is_approved', top=10)
   digit_in_summary  project_is_approved  total       Avg
0                 0                82563  98012  0.842376
1                 1                10143  11236  0.902723
==================================================
   digit_in_summary  project_is_approved  total       Avg
0                 0                82563  98012  0.842376
1                 1                10143  11236  0.902723

summary:

the summaries having numeric values contain high acceptance rate it is 90% and majority of the projects are not having numerical values stating the requirement of the products.
In [54]:
project_data.head(2)
Out[54]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_grade_category project_title project_essay_1 project_essay_2 ... project_essay_4 project_resource_summary teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price quantity digit_in_summary
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Grades PreK-2 Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... ... NaN My students need opportunities to practice beg... 0 0 Literacy_Language ESL Literacy My students are English learners that are work... 154.6 23 0
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Grades 6-8 Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... ... NaN My students need a projector to help with view... 7 1 History_Civics Health_Sports Civics_Government TeamSports Our students arrive to our school eager to lea... 299.0 1 0

2 rows × 21 columns

In [63]:
print(project_data['project_title'].values[0])
print("="*50)
print(project_data['project_title'].values[150])
print("="*50)
print(project_data['project_title'].values[500])
print("="*50)
print(project_data['project_title'].values[1000])
print("="*50)
print(project_data['project_title'].values[2000])
print("="*50)
print(project_data['project_title'].values[5000])
print("="*50)
print(project_data['project_title'].values[7000])
print("="*50)
print(project_data['project_title'].values[12000])
print("="*50)
print(project_data['project_title'].values[17000])
print("="*50)
print(project_data['project_title'].values[50000])
print("="*50)
print(project_data['project_title'].values[98000])
print("="*50)
Educational Support for English Learners at Home
==================================================
More Movement with Hokki Stools
==================================================
Classroom Chromebooks for College Bound Seniors!
==================================================
Sailing Into a Super 4th Grade Year
==================================================
Steady Stools for Active Learning
==================================================
Bouncing Our Wiggles and Worries Away!
==================================================
Finding Fitness
==================================================
Cooperative Learning in 4th Grade!
==================================================
Fire for Learning!
==================================================
Help our Bridgeport Students to Improve Their Listening Skills!
==================================================
Keep'em Moving!
==================================================
In [64]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
In [65]:
sent = decontracted(project_data['project_title'].values[17000])
print(sent)
print("="*50)
Fire for Learning!
==================================================
In [66]:
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]
In [67]:
preprocessed_titles = []

for titles in tqdm(project_data["project_title"]):
    title = decontracted(titles)
    title = title.replace('\\r', ' ')
    title = title.replace('\\"', ' ')
    title = title.replace('\\n', ' ')
    title = re.sub('[^A-Za-z0-9]+', ' ', title)
    title = ' '.join(f for f in title.split() if f not in stopwords)
    preprocessed_titles.append(title.lower().strip())
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:06<00:00, 16764.65it/s]
In [68]:
print(preprocessed_titles[10000])
print("="*50)
print(preprocessed_titles[4444])
print("="*50)
family book clubs
==================================================
innovative seating creative minds part 3
==================================================

vectorizing categorical data.

In [69]:
my_counter = Counter()
for state in project_data['school_state'].values:
    my_counter.update(state.split())
In [70]:
school_state_cat_dict = dict(my_counter)
sorted_school_state_cat_dict = dict(sorted(school_state_cat_dict.items(), key=lambda kv: kv[1]))
In [71]:
vectorizer = CountVectorizer(vocabulary=list(sorted_school_state_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['school_state'].values)
print(vectorizer.get_feature_names())

school_state_categories_one_hot = vectorizer.transform(project_data['school_state'].values)
print("Shape of matrix after one hot encoding ",school_state_categories_one_hot.shape)
['VT', 'WY', 'ND', 'MT', 'RI', 'SD', 'NE', 'DE', 'AK', 'NH', 'WV', 'ME', 'HI', 'DC', 'NM', 'KS', 'IA', 'ID', 'AR', 'CO', 'MN', 'OR', 'KY', 'MS', 'NV', 'MD', 'CT', 'TN', 'UT', 'AL', 'WI', 'VA', 'AZ', 'NJ', 'OK', 'WA', 'MA', 'LA', 'OH', 'MO', 'IN', 'PA', 'MI', 'SC', 'GA', 'IL', 'NC', 'FL', 'NY', 'TX', 'CA']
Shape of matrix after one hot encoding  (109248, 51)
In [72]:
catogories = list(project_data['project_grade_category'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
cat_list = []
for i in catogories:
    temp = ""
    # consider we have text like this "Math & Science, Warmth, Care & Hunger"
    for j in i.split(','): # it will split it in three parts ["Math & Science", "Warmth", "Care & Hunger"]
        if 'The' in j.split(): # this will split each of the catogory based on space "Math & Science"=> "Math","&", "Science"
            j=j.replace('The','') # if we have the words "The" we are going to replace it with ''(i.e removing 'The')
        j = j.replace(' ','') # we are placeing all the ' '(space) with ''(empty) ex:"Math & Science"=>"Math&Science"
        temp+=j.strip()+" " #" abc ".strip() will return "abc", remove the trailing spaces
        temp = temp.replace('&','_') # we are replacing the & value into 
    cat_list.append(temp.strip())
In [73]:
project_data['clean_grade_category'] = cat_list
project_data.drop(['project_grade_category'], axis=1, inplace=True)
project_data.head(2)
Out[73]:
Unnamed: 0 id teacher_id teacher_prefix school_state project_submitted_datetime project_title project_essay_1 project_essay_2 project_essay_3 ... project_resource_summary teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price quantity digit_in_summary clean_grade_category
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc Mrs. IN 2016-12-05 13:43:57 Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... NaN ... My students need opportunities to practice beg... 0 0 Literacy_Language ESL Literacy My students are English learners that are work... 154.6 23 0 GradesPreK-2
1 140945 p258326 897464ce9ddc600bced1151f324dd63a Mr. FL 2016-10-25 09:22:10 Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... NaN ... My students need a projector to help with view... 7 1 History_Civics Health_Sports Civics_Government TeamSports Our students arrive to our school eager to lea... 299.0 1 0 Grades6-8

2 rows × 21 columns

In [74]:
my_counter = Counter()
for project_grade in project_data['clean_grade_category'].values:
    my_counter.update(project_grade.split())
In [75]:
project_grade_cat_dict = dict(my_counter)
sorted_project_grade_cat_dict = dict(sorted(project_grade_cat_dict.items(), key=lambda kv: kv[1]))
In [76]:
vectorizer = CountVectorizer(vocabulary=list(sorted_project_grade_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_grade_category'].values)
print(vectorizer.get_feature_names())

project_grade_categories_one_hot = vectorizer.transform(project_data['clean_grade_category'].values)
print("Shape of matrix after one hot encoding ",project_grade_categories_one_hot.shape)
['Grades9-12', 'Grades6-8', 'Grades3-5', 'GradesPreK-2']
Shape of matrix after one hot encoding  (109248, 4)
In [79]:
catogories = list(project_data['teacher_prefix'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python
cat_list = []
for i in catogories:
    temp = ""
    # consider we have text like this "Math & Science, Warmth, Care & Hunger"
    for j in i.split(','): # it will split it in three parts ["Math & Science", "Warmth", "Care & Hunger"]
        if 'The' in j.split(): # this will split each of the catogory based on space "Math & Science"=> "Math","&", "Science"
            j=j.replace('The','') # if we have the words "The" we are going to replace it with ''(i.e removing 'The')
        j = j.replace(' ','') # we are placeing all the ' '(space) with ''(empty) ex:"Math & Science"=>"Math&Science"
        temp+=j.strip()+" " #" abc ".strip() will return "abc", remove the trailing spaces
        temp = temp.replace('&','_') # we are replacing the & value into 
    cat_list.append(temp.strip())
In [80]:
project_data['clean_teacher_prefix'] = cat_list
project_data.drop(['teacher_prefix'], axis=1, inplace=True)
project_data.head(2)
Out[80]:
Unnamed: 0 id teacher_id school_state project_submitted_datetime project_title project_essay_1 project_essay_2 project_essay_3 project_essay_4 ... teacher_number_of_previously_posted_projects project_is_approved clean_categories clean_subcategories essay price quantity digit_in_summary clean_grade_category clean_teacher_prefix
0 160221 p253737 c90749f5d961ff158d4b4d1e7dc665fc IN 2016-12-05 13:43:57 Educational Support for English Learners at Home My students are English learners that are work... \"The limits of your language are the limits o... NaN NaN ... 0 0 Literacy_Language ESL Literacy My students are English learners that are work... 154.6 23 0 GradesPreK-2 Mrs.
1 140945 p258326 897464ce9ddc600bced1151f324dd63a FL 2016-10-25 09:22:10 Wanted: Projector for Hungry Learners Our students arrive to our school eager to lea... The projector we need for our school is very c... NaN NaN ... 7 1 History_Civics Health_Sports Civics_Government TeamSports Our students arrive to our school eager to lea... 299.0 1 0 Grades6-8 Mr.

2 rows × 21 columns

In [78]:
project_data["teacher_prefix"].fillna(" ", inplace = True)
In [81]:
my_counter = Counter()
for teacher_prefix in project_data['clean_teacher_prefix'].values:
    teacher_prefix = str(teacher_prefix)
    my_counter.update(teacher_prefix.split())
In [82]:
teacher_prefix_cat_dict = dict(my_counter)
sorted_teacher_prefix_cat_dict = dict(sorted(teacher_prefix_cat_dict.items(), key=lambda kv: kv[1]))
In [83]:
#https://stackoverflow.com/questions/39303912/tfidfvectorizer-in-scikit-learn-valueerror-np-nan-is-an-invalid-document/39308809#39308809
vectorizer = CountVectorizer(vocabulary=list(sorted_teacher_prefix_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_teacher_prefix'].values.astype("U"))
#teacher_prefix_new = project_data['teacher_prefix'].notnull()
print(vectorizer.get_feature_names())

teacher_prefix_categories_one_hot = vectorizer.transform(project_data['clean_teacher_prefix'].values.astype("U"))
print("Shape of matrix after one hot encoding ",teacher_prefix_categories_one_hot.shape)
['Dr.', 'Teacher', 'Mr.', 'Ms.', 'Mrs.']
Shape of matrix after one hot encoding  (109248, 5)

1.4.2.2 BOW ON PROJECT TITLE

In [84]:
# merge two column text dataframe: 
project_data["essay"] = project_data["project_essay_1"].map(str) +\
                        project_data["project_essay_2"].map(str) + \
                        project_data["project_essay_3"].map(str) + \
                        project_data["project_essay_4"].map(str)
In [85]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase
In [86]:
sent = decontracted(project_data['essay'].values[20000])
print(sent)
print("="*50)
My kindergarten students have varied disabilities ranging from speech and language delays, cognitive delays, gross/fine motor delays, to autism. They are eager beavers and always strive to work their hardest working past their limitations. \r\n\r\nThe materials we have are the ones I seek out for my students. I teach in a Title I school where most of the students receive free or reduced price lunch.  Despite their disabilities and limitations, my students love coming to school and come eager to learn and explore.Have you ever felt like you had ants in your pants and you needed to groove and move as you were in a meeting? This is how my kids feel all the time. The want to be able to move as they learn or so they say.Wobble chairs are the answer and I love then because they develop their core, which enhances gross motor and in Turn fine motor skills. \r\nThey also want to learn through games, my kids do not want to sit and do worksheets. They want to learn to count by jumping and playing. Physical engagement is the key to our success. The number toss and color and shape mats can make that happen. My students will forget they are doing work and just have the fun a 6 year old deserves.nannan
==================================================
In [87]:
# \r \n \t remove from string python: http://texthandler.com/info/remove-line-breaks-python/
sent = sent.replace('\\r', ' ')
sent = sent.replace('\\"', ' ')
sent = sent.replace('\\n', ' ')
print(sent)
My kindergarten students have varied disabilities ranging from speech and language delays, cognitive delays, gross/fine motor delays, to autism. They are eager beavers and always strive to work their hardest working past their limitations.     The materials we have are the ones I seek out for my students. I teach in a Title I school where most of the students receive free or reduced price lunch.  Despite their disabilities and limitations, my students love coming to school and come eager to learn and explore.Have you ever felt like you had ants in your pants and you needed to groove and move as you were in a meeting? This is how my kids feel all the time. The want to be able to move as they learn or so they say.Wobble chairs are the answer and I love then because they develop their core, which enhances gross motor and in Turn fine motor skills.   They also want to learn through games, my kids do not want to sit and do worksheets. They want to learn to count by jumping and playing. Physical engagement is the key to our success. The number toss and color and shape mats can make that happen. My students will forget they are doing work and just have the fun a 6 year old deserves.nannan
In [88]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"]
In [89]:
from tqdm import tqdm
preprocessed_essays = []
# tqdm is for printing the status bar
for sentance in tqdm(project_data['essay'].values):
    sent = decontracted(sentance)
    sent = sent.replace('\\r', ' ')
    sent = sent.replace('\\"', ' ')
    sent = sent.replace('\\n', ' ')
    sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
    # https://gist.github.com/sebleier/554280
    sent = ' '.join(e for e in sent.split() if e not in stopwords)
    preprocessed_essays.append(sent.lower().strip())
100%|█████████████████████████████████████████████████████████████████████████| 109248/109248 [02:27<00:00, 740.15it/s]
In [91]:
vectorizer = CountVectorizer(min_df=10)
text_bow = vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encoding ",text_bow.shape)
Shape of matrix after one hot encoding  (109248, 16623)
In [92]:
preprocessed_titles = []

for titles in tqdm(project_data["project_title"]):
    title = decontracted(titles)
    title = title.replace('\\r', ' ')
    title = title.replace('\\"', ' ')
    title = title.replace('\\n', ' ')
    title = re.sub('[^A-Za-z0-9]+', ' ', title)
    title = ' '.join(f for f in title.split() if f not in stopwords)
    preprocessed_titles.append(title.lower().strip())
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:07<00:00, 13804.69it/s]
In [94]:
vectorizer = CountVectorizer(min_df=10)
text_bow = vectorizer.fit_transform(preprocessed_titles)
print("Shape of matrix after one hot encoding ",text_bow.shape)
Shape of matrix after one hot encoding  (109248, 3329)

'''TFIDF on Project_title:'''

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=10)
text_tfidf = vectorizer.fit_transform(preprocessed_essays)
print("Shape of matrix after one hot encodig ",text_tfidf.shape)
Shape of matrix after one hot encodig  (109248, 16623)
In [96]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=20)
text_tfidf = vectorizer.fit_transform(preprocessed_titles)
print("Shape of matrix after one hot encodig ",text_tfidf.shape)
Shape of matrix after one hot encodig  (109248, 2190)
In [101]:
'''
# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile,'r', encoding="utf8")
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print ("Done.",len(model)," words loaded!")
    return model
model = loadGloveModel('glove.42B.300d.txt')

# ============================
Output:
    
Loading Glove Model
1917495it [06:32, 4879.69it/s]
Done. 1917495  words loaded!

# ============================

words = []
for i in preproced_texts:
    words.extend(i.split(' '))

for i in preproced_titles:
    words.extend(i.split(' '))
print("all the words in the coupus", len(words))
words = set(words)
print("the unique words in the coupus", len(words))

inter_words = set(model.keys()).intersection(words)
print("The number of words that are present in both glove vectors and our coupus", \
      len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")

words_courpus = {}
words_glove = set(model.keys())
for i in words:
    if i in words_glove:
        words_courpus[i] = model[i]
print("word 2 vec length", len(words_courpus))


# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/

import pickle
with open('glove_vectors', 'wb') as f:
    pickle.dump(words_courpus, f)


'''
Out[101]:
'\n# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039\ndef loadGloveModel(gloveFile):\n    print ("Loading Glove Model")\n    f = open(gloveFile,\'r\', encoding="utf8")\n    model = {}\n    for line in tqdm(f):\n        splitLine = line.split()\n        word = splitLine[0]\n        embedding = np.array([float(val) for val in splitLine[1:]])\n        model[word] = embedding\n    print ("Done.",len(model)," words loaded!")\n    return model\nmodel = loadGloveModel(\'glove.42B.300d.txt\')\n\n# ============================\nOutput:\n    \nLoading Glove Model\n1917495it [06:32, 4879.69it/s]\nDone. 1917495  words loaded!\n\n# ============================\n\nwords = []\nfor i in preproced_texts:\n    words.extend(i.split(\' \'))\n\nfor i in preproced_titles:\n    words.extend(i.split(\' \'))\nprint("all the words in the coupus", len(words))\nwords = set(words)\nprint("the unique words in the coupus", len(words))\n\ninter_words = set(model.keys()).intersection(words)\nprint("The number of words that are present in both glove vectors and our coupus",       len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")\n\nwords_courpus = {}\nwords_glove = set(model.keys())\nfor i in words:\n    if i in words_glove:\n        words_courpus[i] = model[i]\nprint("word 2 vec length", len(words_courpus))\n\n\n# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/\n\nimport pickle\nwith open(\'glove_vectors\', \'wb\') as f:\n    pickle.dump(words_courpus, f)\n\n\n'
In [165]:
# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# make sure you have the glove_vectors file
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
    glove_words =  set(model.keys())
In [166]:
avg_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += model[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words
    avg_w2v_vectors.append(vector)

print(len(avg_w2v_vectors))
print(len(avg_w2v_vectors[0]))
100%|████████████████████████████████████████████████████████████████████████| 109248/109248 [01:11<00:00, 1520.27it/s]
109248
300

'''avg w2v on project_title'''

In [167]:
avg_w2v_vectors_titles = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_titles): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if word in glove_words:
            vector += model[word]
            cnt_words += 1
    if cnt_words != 0:
        vector /= cnt_words
    avg_w2v_vectors_titles.append(vector)

print(len(avg_w2v_vectors_titles))
print(len(avg_w2v_vectors_titles[0]))
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:03<00:00, 32073.85it/s]
109248
300
In [168]:
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_essays)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
In [107]:
tfidf_w2v_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_essays): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors.append(vector)

print(len(tfidf_w2v_vectors))
print(len(tfidf_w2v_vectors[0]))
100%|█████████████████████████████████████████████████████████████████████████| 109248/109248 [08:45<00:00, 207.82it/s]
109248
300

'''tfidf weighted w2v on project_title'''

In [169]:
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_titles)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
In [170]:
tfidf_w2v_vectors_titles = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_titles): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_titles.append(vector)

print(len(tfidf_w2v_vectors_titles))
print(len(tfidf_w2v_vectors_titles[0]))
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:08<00:00, 12208.26it/s]
109248
300
In [115]:
# we get the cost of the project using resource.csv file
resource_data.head(2)
Out[115]:
id description quantity price
0 p233245 LC652 - Lakeshore Double-Space Mobile Drying Rack 1 149.00
1 p069063 Bouncy Bands for Desks (Blue support pipes) 3 14.95
In [121]:
# https://stackoverflow.com/questions/22407798/how-to-reset-a-dataframes-indexes-for-all-groups-in-one-step
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
price_data.head(2)
Out[121]:
id price quantity
0 p000001 459.56 7
1 p000002 515.89 21
In [122]:
# join two dataframes in python: 
project_data = pd.merge(project_data, price_data, on='id', how='left')
In [123]:
approved_price = project_data[project_data['project_is_approved']==1]['price'].values

rejected_price = project_data[project_data['project_is_approved']==0]['price'].values
In [124]:
# check this one: https://www.youtube.com/watch?v=0HOqOcln3Z4&t=530s
# standardization sklearn: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler
price_scalar = StandardScaler()
price_scalar.fit(project_data['price'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print(f"Mean : {price_scalar.mean_[0]}, Standard deviation : {np.sqrt(price_scalar.var_[0])}")

# Now standardize the data with above maen and variance.
price_standardized = price_scalar.transform(project_data['price'].values.reshape(-1, 1))
price_standardized
Mean : 298.1193425966608, Standard deviation : 367.49634838483496
Out[124]:
array([[-0.3905327 ],
       [ 0.00239637],
       [ 0.59519138],
       ...,
       [-0.15825829],
       [-0.61243967],
       [-0.51216657]])

'''vectorizing categorical data of projects'''

''' In teacher grade category and teacher_prefix we are dropping a column to get the exact number of values after one hot encoding of these features.'''

'''we removed the null(Nan)column in the teacher_prefix to avoid errors while execution.'''

In [126]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(vocabulary=list(sorted_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_categories'].values)
print(vectorizer.get_feature_names())
categories_one_hot = vectorizer.transform(project_data['clean_categories'].values)
print("Shape of matrix after one hot encoding ",categories_one_hot.shape)
['Warmth', 'Care_Hunger', 'History_Civics', 'Music_Arts', 'AppliedLearning', 'SpecialNeeds', 'Health_Sports', 'Math_Science', 'Literacy_Language']
Shape of matrix after one hot encoding  (109248, 9)
In [127]:
'''vectorizing categorical data-clean sub categories of projects'''
vectorizer = CountVectorizer(vocabulary=list(sorted_sub_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_subcategories'].values)
print(vectorizer.get_feature_names())


sub_categories_one_hot = vectorizer.transform(project_data['clean_subcategories'].values)
print("Shape of matrix after one hot encoding ",sub_categories_one_hot.shape)
['Economics', 'CommunityService', 'FinancialLiteracy', 'ParentInvolvement', 'Extracurricular', 'Civics_Government', 'ForeignLanguages', 'NutritionEducation', 'Warmth', 'Care_Hunger', 'SocialSciences', 'PerformingArts', 'CharacterEducation', 'TeamSports', 'Other', 'College_CareerPrep', 'Music', 'History_Geography', 'Health_LifeScience', 'EarlyDevelopment', 'ESL', 'Gym_Fitness', 'EnvironmentalScience', 'VisualArts', 'Health_Wellness', 'AppliedSciences', 'SpecialNeeds', 'Literature_Writing', 'Mathematics', 'Literacy']
Shape of matrix after one hot encoding  (109248, 30)

'''one hot code for --school states'''

In [128]:
my_counter = Counter()
for state in project_data['school_state'].values:
    my_counter.update(state.split())
    
school_state_cat_dict = dict(my_counter)
sorted_school_state_cat_dict = dict(sorted(school_state_cat_dict.items(), key=lambda kv: kv[1]))
In [129]:
vectorizer = CountVectorizer(vocabulary=list(sorted_school_state_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['school_state'].values)
print(vectorizer.get_feature_names())

school_state_categories_one_hot = vectorizer.transform(project_data['school_state'].values)
print("Shape of matrix after one hot encoding ",school_state_categories_one_hot.shape)
['VT', 'WY', 'ND', 'MT', 'RI', 'SD', 'NE', 'DE', 'AK', 'NH', 'WV', 'ME', 'HI', 'DC', 'NM', 'KS', 'IA', 'ID', 'AR', 'CO', 'MN', 'OR', 'KY', 'MS', 'NV', 'MD', 'CT', 'TN', 'UT', 'AL', 'WI', 'VA', 'AZ', 'NJ', 'OK', 'WA', 'MA', 'LA', 'OH', 'MO', 'IN', 'PA', 'MI', 'SC', 'GA', 'IL', 'NC', 'FL', 'NY', 'TX', 'CA']
Shape of matrix after one hot encoding  (109248, 51)

'''one hot code for project_grade_category'''

In [130]:
my_counter = Counter()
for project_grade in project_data['clean_grade_category'].values:
    my_counter.update(project_grade.split())
    
project_grade_cat_dict = dict(my_counter)
sorted_project_grade_cat_dict = dict(sorted(project_grade_cat_dict.items(), key=lambda kv: kv[1]))
In [141]:
vectorizer = CountVectorizer(vocabulary=list(sorted_project_grade_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_grade_category'].values)
print(vectorizer.get_feature_names())

clean_grade_categories_one_hot = vectorizer.transform(project_data['clean_grade_category'].values)
print("Shape of matrix after one hot encoding ",clean_grade_categories_one_hot.shape)
['Grades9-12', 'Grades6-8', 'Grades3-5', 'GradesPreK-2']
Shape of matrix after one hot encoding  (109248, 4)

'''one hot code for --teacher prefix'''

In [143]:
my_counter = Counter()
for teacher_prefix in project_data['clean_teacher_prefix'].values:
    teacher_prefix = str(teacher_prefix)
    my_counter.update(teacher_prefix.split())

teacher_prefix_cat_dict = dict(my_counter)
sorted_teacher_prefix_cat_dict = dict(sorted(teacher_prefix_cat_dict.items(), key=lambda kv: kv[1]))
In [144]:
vectorizer = CountVectorizer(vocabulary=list(sorted_teacher_prefix_cat_dict.keys()), lowercase=False, binary=True)
vectorizer.fit(project_data['clean_teacher_prefix'].values.astype("U"))
print(vectorizer.get_feature_names())

clean_teacher_prefix_categories_one_hot = vectorizer.transform(project_data['clean_teacher_prefix'].values.astype("U"))
print("Shape of matrix after one hot encoding ",clean_teacher_prefix_categories_one_hot.shape)
['Dr.', 'Teacher', 'Mr.', 'Ms.', 'Mrs.']
Shape of matrix after one hot encoding  (109248, 5)
In [134]:
print(categories_one_hot.shape)
print(sub_categories_one_hot.shape)
print(text_bow.shape)
print(price_standardized.shape)
(109248, 9)
(109248, 30)
(109248, 3329)
(109248, 1)
In [135]:
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
from scipy.sparse import hstack
# with the same hstack function we are concatinating a sparse matrix and a dense matirx :)
X = hstack((categories_one_hot, sub_categories_one_hot, text_bow, price_standardized))
X.shape
Out[135]:
(109248, 3369)
In [136]:
from sklearn.preprocessing import StandardScaler
price_scalar = StandardScaler()
price_scalar.fit(project_data['price'].values.reshape(-1,1)) # finding the mean and standard deviation of this data
print("Mean : {}".format(price_scalar.mean_[0]))
print("Standard deviation : {}".format(np.sqrt(price_scalar.var_[0])))
# Now standardize the data with above maen and variance.
price_standardized = price_scalar.transform(project_data['price'].values.reshape(-1, 1))
Mean : 298.1193425966608
Standard deviation : 367.49634838483496
In [137]:
import warnings
warnings.filterwarnings("ignore")
prev_projects_scalar = StandardScaler()

prev_projects_scalar.fit(project_data['teacher_number_of_previously_posted_projects'].values.reshape(-1,1)) 

print("Mean : {}".format(prev_projects_scalar.mean_[0]))

print("Standard deviation : {}".format(np.sqrt(prev_projects_scalar.var_[0])))

prev_projects_standardized = prev_projects_scalar.transform(project_data['teacher_number_of_previously_posted_projects'].
values.reshape(-1, 1))

prev_projects_standardized
Mean : 11.153165275336848
Standard deviation : 27.77702641477403
Out[137]:
array([[-0.40152481],
       [-0.14951799],
       [-0.36552384],
       ...,
       [-0.29352189],
       [-0.40152481],
       [-0.40152481]])
In [138]:
vectorizer = CountVectorizer(min_df=5)
title_bow = vectorizer.fit_transform(preprocessed_titles)
print("Shape of matrix after one hot encoding ",title_bow.shape)
Shape of matrix after one hot encoding  (109248, 5107)
In [139]:
import warnings
warnings.filterwarnings("ignore")

quantity_scalar = StandardScaler()

## Finding the mean and standard deviation of this data
quantity_scalar.fit(project_data['quantity'].values.reshape(-1,1)) 

print("Mean : {}".format(quantity_scalar.mean_[0]))

print("Standard deviation : {}".format(np.sqrt(quantity_scalar.var_[0])))

# Now standardize the data with above maen and variance.
quantity_standardized = quantity_scalar.transform(project_data['quantity'].values.reshape(-1, 1))
Mean : 16.965610354422964
Standard deviation : 26.182821919093175

'''tsne Bag of words encoding project title feature'''

In [145]:
X = hstack((categories_one_hot, sub_categories_one_hot, 
            school_state_categories_one_hot, 
            clean_grade_categories_one_hot, 
            clean_teacher_prefix_categories_one_hot, 
            price_standardized, 
            quantity_standardized, 
            prev_projects_standardized, 
            title_bow))
X.shape
Out[145]:
(109248, 5209)
In [146]:
from sklearn.manifold import TSNE
X = X.tocsr()
X_new = X[0:5000,:]

X_new = X_new.toarray()
model = TSNE(n_components = 2, perplexity = 100.0, random_state = 0)
tsne_data_b = model.fit_transform(X_new)

labels = project_data["project_is_approved"]
labels_new = labels[0: 5000]
len(labels_new)
Out[146]:
5000

we can see that the data is completely overlapped, and the points are scattered and formed into group of clusters.

In [147]:
tsne_data_b = np.vstack((tsne_data_b.T, labels_new)).T
tsne_df_b = pd.DataFrame(tsne_data_b, columns = ("1st_Dim","2nd_Dim","Labels"))

tsne_df_b.shape
sns.FacetGrid(tsne_df_b, hue = "Labels", size = 10).map(plt.scatter, "1st_Dim", "2nd_Dim").add_legend().fig.suptitle("TSNE WITH BOW ENCODING OF PROJECT TITLE FEATURE ")
plt.show()

'''tsne for tfidf encoding project title'''

In [149]:
'''tsne for tfidf encoding project title'''
X = hstack((categories_one_hot, 
            sub_categories_one_hot,
            school_state_categories_one_hot,
            clean_grade_categories_one_hot,
            clean_teacher_prefix_categories_one_hot, 
            price_standardized, 
            quantity_standardized, 
            prev_projects_standardized,
            text_tfidf))
X.shape
Out[149]:
(109248, 2292)
In [151]:
X = X.tocsr()
X_new = X[0:5000,:]
X_new = X_new.toarray()
model = TSNE(n_components = 2, perplexity = 100.0, random_state = 0)
tsne_data_tfidf = model.fit_transform(X_new)
tsne_data_tfidf = np.vstack((tsne_data_tfidf.T, labels_new)).T
tsne_df_tfidf = pd.DataFrame(tsne_data_tfidf, columns = ("1st_Dim","2nd_Dim","Labels"))
tsne_df_tfidf.shape
sns.FacetGrid(tsne_df_tfidf, hue = "Labels", size = 10).map(plt.scatter, "1st_Dim", "2nd_Dim").add_legend().fig.suptitle("TSNE WITH TF-IDF ENCODING OF PROJECT TITLE FEATURE ")
plt.show()

we can observe that the points are scattered in different directions and cannot seem to find the conclusion

tsne for avg w2v encoding project title

In [152]:
'''tsne for avg w2v encoding project title'''
X = hstack((categories_one_hot, 
            sub_categories_one_hot, 
            school_state_categories_one_hot, 
            project_grade_categories_one_hot, 
            teacher_prefix_categories_one_hot, 
            price_standardized,
            quantity_standardized,
            prev_projects_standardized,
            avg_w2v_vectors_titles))
X.shape
Out[152]:
(109248, 402)
In [153]:
X = X.tocsr()
X_new = X[0:5000,:]
X_new = X_new.toarray()
model = TSNE(n_components = 2, perplexity = 100.0, random_state = 0)
tsne_data_avg_w2v = model.fit_transform(X_new)
In [154]:
tsne_data_avg_w2v = np.vstack((tsne_data_avg_w2v.T, labels_new)).T
tsne_df_avg_w2v = pd.DataFrame(tsne_data_avg_w2v, columns = ("1st_Dim","2nd_Dim","Labels"))
In [155]:
tsne_df_avg_w2v.shape
Out[155]:
(5000, 3)
In [156]:
sns.FacetGrid(tsne_df_avg_w2v, hue = "Labels", size = 10).map(plt.scatter, "1st_Dim", "2nd_Dim").add_legend().fig.suptitle("TSNE WITH AVG W2V ENCODING OF PROJECT TITLE FEATURE ")
plt.show()
In [172]:
#https://stackoverflow.com/questions/27431390/typeerror-zip-object-is-not-subscriptable
def text_from_tagged_ngram(ngram): 
    if type(ngram) == tuple:
        return ngram[0]
    return " ".join(zip(*ngram)[0])

this tsne of average w2v and project title we cannot observe any clusters and also cannot find weather the project is accepted or not

In [173]:
tfidf_model = TfidfVectorizer()
tfidf_model.fit(preprocessed_titles)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf_model.get_feature_names(), list(tfidf_model.idf_)))
tfidf_words = set(tfidf_model.get_feature_names())
In [174]:
tfidf_w2v_vectors_title = []; # the avg-w2v for each sentence/review is stored in this list
for sentence in tqdm(preprocessed_titles): # for each review/sentence
    vector = np.zeros(300) # as word vectors are of zero length
    tf_idf_weight =0; # num of words with a valid vector in the sentence/review
    for word in sentence.split(): # for each word in a review/sentence
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word] # getting the vector for each word
            # here we are multiplying idf value(dictionary[word]) and the tf value((sentence.count(word)/len(sentence.split())))
            tf_idf = dictionary[word]*(sentence.count(word)/len(sentence.split())) # getting the tfidf value for each word
            vector += (vec * tf_idf) # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    tfidf_w2v_vectors_title.append(vector)
    
print(len(tfidf_w2v_vectors_title))
print(len(tfidf_w2v_vectors_title[0]))
100%|███████████████████████████████████████████████████████████████████████| 109248/109248 [00:07<00:00, 15097.31it/s]
109248
300

'''TSNE with TFIDF Weighted W2V encoding of project_title feature'''

In [175]:
X = hstack((categories_one_hot, 
            sub_categories_one_hot, 
            school_state_categories_one_hot, 
            clean_grade_categories_one_hot, 
            clean_teacher_prefix_categories_one_hot, 
            price_standardized, 
            quantity_standardized, 
            prev_projects_standardized, 
            tfidf_w2v_vectors_title))
X.shape
Out[175]:
(109248, 402)
In [176]:
X = X.tocsr()
X_new = X[0:5000,:]
X_new = X_new.toarray()
model = TSNE(n_components = 2, perplexity = 100.0, random_state = 0)
tsne_data_tfidf_w2v = model.fit_transform(X_new)
tsne_data_tfidf_w2v = np.vstack((tsne_data_tfidf_w2v.T, labels_new)).T
tsne_df_tfidf_w2v = pd.DataFrame(tsne_data_tfidf_w2v, columns = ("1st_Dim","2nd_Dim","Labels"))
tsne_df_tfidf_w2v.shape

sns.FacetGrid(tsne_df_tfidf_w2v, hue = "Labels", size = 10).map(plt.scatter, "1st_Dim", "2nd_Dim").add_legend().fig.suptitle("TSNE WITH TF-IDF WEIGHTED W2V ENCODING OF PROJECT TITLE FEATURE ")
plt.show()

these combinations are not clearly drawing us through the results and we might have to try more combinations for desired result.

In [178]:
X = hstack((categories_one_hot, sub_categories_one_hot, school_state_categories_one_hot,
project_grade_categories_one_hot, teacher_prefix_categories_one_hot, price_standardized,
quantity_standardized, prev_projects_standardized, title_bow, text_tfidf, avg_w2v_vectors_titles,
tfidf_w2v_vectors_title))
X.shape
Out[178]:
(109248, 7999)
In [190]:
X = X.tocsr()
X_new = X[0:5000,:]
tsne_data_complete = model.fit_transform(X_new.toarray())
In [191]:
tsne_data_complete= np.vstack((tsne_data_complete.T, labels_new)).T
tsne_df_complete = pd.DataFrame(tsne_data_complete, columns = ("1st_Dim","2nd_Dim","Labels"))
In [192]:
tsne_df_complete.shape
Out[192]:
(5000, 3)
In [198]:
sns.FacetGrid(tsne_df_complete, hue = "Labels", size = 10).map(plt.scatter, "1st_Dim", "2nd_Dim").add_legend().fig.suptitle("TSNE WITH BOW, TF-IDF, AVG W2V, TF-IDF WEIGHTED W2V ENCODING OF PROJECT TITLE FEATURE ")
plt.show()

These combinations are not clearly drawing us through the results and we might have to try more combinations for desired result of clustering similar data points.

conclusion:

1. DE state from united states is having highest percentage of projects which are accepted in the whole country and almost it has 90% of the acceptance rate. and ND is having 89% and WA is having 88% respectively.
2.VT is having the lowest approval rate with 80% followed by DC and TX with 80 % and 81%.
3.female teachers are having the max number of projects proposed and accepted when compared to the male teachers.
4.there are lot of projects proposed for pre kindergarden adn 2nd grade while the rest it is decreasing as the increase in grade can be seen.
5.We  can  also notice that the Students between the 9th Grade and 12th Grade have the lowest number of projects proposed and accepted.
6. Projects belonging to Literacy and Language categories have the highest number of projects proposed and The maximum number of projects that are accepted also belong to the same category, with the acceptance rate of 87%.
7. Projects belonging to Maths and Science are having acceptance rate of 82% while the concept of Literacy and Language to this can increase its accpetance rate to 87%
8. Projects belonging to both Maths and Science when combined with the Applied Learning is having the least number of projects proposed and approved.
9. There is Variability in Acceptance rate, projects under the category Warmth, Care and Hunger have an accpetance rate of 93.5%
10. The highest number of projects are registered under Literacy and Langauage with 52,239 projects, followed by Maths and Science having 41,421 projects.
11. The sub-Category Literacy has the highest number of projects approved with 8371 projects. Also the accpetance rate is 88%.
12. The sub-Category Health and Wellness have the lowest number of projects proposed with 3,583 projects only.
13. Roughly most of the projects have 3, 4 or 5 words in the title.There are hardly any project titles containing more than 10 words.
14. The number of words in the Project Essays of Approved Projects are slightly more than the number of words in the Project Essays of the Rejected Projects.
15. The Maximum price for any project should be less than 10,000 dollars.The approved projects tend to have lower cost when compared to the projects that have not been approved.
16. We observe that it is not mandatory for a teacher to have proposed any project prior. Maximum number of teachers, nearly 82% of the approved projects have been submitted by teachers with no prior project proposals. New talent and efforts are well appreciated.
17. Very few teachers who have proposed more than 20 projects have got approval. But the rate of approval is Higher given the teacher has proposed atleast 19 different projects.
18. The project summaries containing numeric values have a very high acceptance rate of 90%. Well, proper numbered requirements suggest clarity in the proposals and hence Alot of people tend to donate for a better cause, that is to help children.
19. We observe that on an average Each project costs nearly 298 Dollars. The Price paid is generally for the purchase of the Items. The projects on an average require atleast 17 Different of similar items.
20. Visualisation of TSNE with Bag of Words, TF-IDF, Avg Word2Vec, TF-IDF Weighted Word2Vec does not seem to yield the expected result of clustering similar data points. Hence we would have to try any other method.


This is all about the donars choose data set and it analysis. the above drawn conclusions are drawn from the observations done in the sample data set and also by performing some of the operations on the given data.